# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dalex as dx
import warnings
warnings.filterwarnings('ignore')
import plotly
plotly.offline.init_notebook_mode()
# Show all columns when displaying DataFrames (the dataset has 32 columns).
pd.set_option('display.max_columns', None)
# Load the hotel bookings dataset (one row per booking).
df = pd.read_csv('hotel_bookings.csv')
# Preview the first few rows.
df.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 3 | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 4 | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Direct | Direct | 0 | 0 | 0 | A | C | 0 | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Corporate | Corporate | 0 | 0 | 0 | A | A | 0 | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
df.shape
(119390, 32)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119390 entries, 0 to 119389 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 hotel 119390 non-null object 1 is_canceled 119390 non-null int64 2 lead_time 119390 non-null int64 3 arrival_date_year 119390 non-null int64 4 arrival_date_month 119390 non-null object 5 arrival_date_week_number 119390 non-null int64 6 arrival_date_day_of_month 119390 non-null int64 7 stays_in_weekend_nights 119390 non-null int64 8 stays_in_week_nights 119390 non-null int64 9 adults 119390 non-null int64 10 children 119386 non-null float64 11 babies 119390 non-null int64 12 meal 119390 non-null object 13 country 118902 non-null object 14 market_segment 119390 non-null object 15 distribution_channel 119390 non-null object 16 is_repeated_guest 119390 non-null int64 17 previous_cancellations 119390 non-null int64 18 previous_bookings_not_canceled 119390 non-null int64 19 reserved_room_type 119390 non-null object 20 assigned_room_type 119390 non-null object 21 booking_changes 119390 non-null int64 22 deposit_type 119390 non-null object 23 agent 103050 non-null float64 24 company 6797 non-null float64 25 days_in_waiting_list 119390 non-null int64 26 customer_type 119390 non-null object 27 adr 119390 non-null float64 28 required_car_parking_spaces 119390 non-null int64 29 total_of_special_requests 119390 non-null int64 30 reservation_status 119390 non-null object 31 reservation_status_date 119390 non-null object dtypes: float64(4), int64(16), object(12) memory usage: 29.1+ MB
# Replace missing values:
# - agent: if no agency is given, the booking was most likely made without one.
# - company: if none is given, it was most likely a private booking.
# - children: missing is interpreted as no children.
# - country: unknown origin.
# NOTE: the original dict used the key "children:" (trailing colon), so the
# missing `children` values were silently never filled; fixed here.
nan_replacements = {"children": 0.0, "country": "Unknown", "agent": 0, "company": 0}
df = df.fillna(nan_replacements)
# "meal" contains the value "Undefined", which is equivalent to "SC" (no meal).
# Assign the result back instead of using inplace=True on a column slice,
# which triggers chained-assignment warnings and is deprecated in recent pandas.
df["meal"] = df["meal"].replace("Undefined", "SC")
# Some rows contain entries with 0 adults, 0 children and 0 babies.
# Drop these bookings with no guests at all. Drop by index label directly;
# the original `df.drop(df.index[labels])` re-interpreted the labels
# positionally, which only works while the index is still the default
# RangeIndex.
zero_guests = df.loc[df["adults"] + df["children"] + df["babies"] == 0].index
df.drop(index=zero_guests, inplace=True)
# feature engineering
# Average daily rate per person.
# NOTE(review): adults + children can still be 0 here (babies-only bookings
# pass the zero-guest filter above), which yields inf/NaN in adr_pp —
# confirm whether such rows should be dropped or imputed.
df["adr_pp"] = df["adr"] / (df["adults"] + df["children"])
# Total length of stay in nights.
df["total_nights"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]
# Manually selected model inputs. Several columns are deliberately excluded
# to keep the model general and to avoid target leakage:
#   arrival_date_year, assigned_room_type, booking_changes,
#   reservation_status, country, days_in_waiting_list, hotel.
# Including country would raise accuracy, but could make the model less
# general and potentially unfair.
num_features = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "total_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "agent",
    "company",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "adr",
    "adr_pp",
]
cat_features = [
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "deposit_type",
    "customer_type",
]
# Full list of model inputs: numerical features first, categorical after.
features = num_features + cat_features
# Split the DataFrame into explanatory variables and the target.
# "is_canceled" is not in `features`, so selecting the feature columns
# directly is equivalent to dropping the target first.
X = df.loc[:, features]
y = df.loc[:, "is_canceled"]
from sklearn.model_selection import train_test_split

# Reproducible 80/20 train/test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)
For simplicity, the models were trained separately in another notebook; here we only load the trained models.
import pickle

# For simplicity the models were trained in a separate notebook; here we only
# load the pickled artifacts. Use a context manager so each file handle is
# closed deterministically (the original open() calls were never closed).
# SECURITY NOTE: pickle.load executes arbitrary code from the file — only
# load model files from a trusted source.
def _load_model(path):
    """Load and return a pickled model from *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

dt_model = _load_model('models/decision_tree')
rf_model = _load_model('models/random_forest')
xgb_model = _load_model('models/xgb')
Let's check which explanatory variables are, in general, important for the models' predictions. For this purpose, we can use Permutation Variable Importance. First, let's create an explainer object for each trained model.
# Wrap each fitted model in a dalex Explainer paired with the held-out test
# data; all explanation methods below operate on these objects.
dt_explainer = dx.Explainer(dt_model, X_test, y_test)
rf_explainer = dx.Explainer(rf_model, X_test, y_test)
xgb_explainer = dx.Explainer(xgb_model, X_test, y_test)
Preparation of a new explainer is initiated -> data : 23842 rows 25 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 23842 values -> model_class : sklearn.tree._classes.DecisionTreeClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x000001BA1BE1C040> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0, mean = 0.371, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -1.0, mean = 0.00164, max = 1.0 -> model_info : package sklearn A new explainer has been created! Preparation of a new explainer is initiated -> data : 23842 rows 25 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 23842 values -> model_class : sklearn.ensemble._forest.RandomForestClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x000001BA1BE1C040> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0, mean = 0.374, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.929, mean = -0.00165, max = 1.0 -> model_info : package sklearn A new explainer has been created! Preparation of a new explainer is initiated -> data : 23842 rows 25 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 23842 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : Not specified, model's class short name will be used. 
(default) -> predict function : <function yhat_proba_default at 0x000001BA1BE1C040> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 4.57e-06, mean = 0.369, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.998, mean = 0.00319, max = 1.0 -> model_info : package sklearn A new explainer has been created!
To calculate the variable-importance measure, we use the model_parts() method. By default it performs B = 10 permutations of variable importance calculated on N = 1000 observations.
# Permutation variable importance for each model; by default dalex permutes
# each variable B = 10 times on N = 1000 sampled observations.
pvi_dt = dt_explainer.model_parts() # default measure 1- AUC
pvi_rf = rf_explainer.model_parts()
pvi_xgb = xgb_explainer.model_parts()
Let's look at the few most important variables.
# Plot every variable's importance (max_vars=None disables the top-k cutoff).
pvi_dt.plot(max_vars=None)
pvi_rf.plot(max_vars=None)
pvi_xgb.plot(max_vars=None)
# Alternative: overlay all three models in a single comparison plot:
# pvi_dt.plot([pvi_rf, pvi_xgb])
Now we can compare the variable importance measures obtained in this way between the different models. The plot suggests that the most important explanatory variable for all three models is deposit type, whose importance value is significantly greater than for the other features. This may be related to the fact discovered during the EDA and illustrated below: customers who made a non-refundable deposit for their booking nearly always cancel their reservations, which seems unintuitive and might be a potential mistake made during the data annotation process. Other variables with high values for the inspected models are total_of_special_requests, lead_time, customer_type and market_segment. The more additional requests customers have, the less likely they are to cancel a reservation; similarly, the shorter the period between the booking and the planned arrival date, the less probable a cancellation is. We may conclude that all three models have a similar set of most important variables. This seems intuitive, because all of the explained models are tree-based.
We can also notice that the variables that do not influence the models' predictions much are nearly identical for all three models. We could exclude adults, children, babies and is_repeated_guest from our models in order to simplify them. However, we have to keep in mind that the unimportance of these variables is probably related to the fact that, for example, most of the customers in the training data were not repeated guests and did not have children or babies.
# Mean cancellation rate per deposit type: bookings with a non-refundable
# deposit cancel almost always, mirroring the variable-importance result.
cancel_rate_by_deposit = df.groupby("deposit_type")["is_canceled"].mean()

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=cancel_rate_by_deposit.index,
    y=cancel_rate_by_deposit.values,
    color="steelblue",
    ax=ax,
)
ax.set_xlabel("Deposit type", fontsize=16)
ax.set_ylabel("Cancelations", fontsize=16)
ax.set_title("Effect of deposit type on cancelation", fontsize=16)
plt.show()